/*******************************************************************************
																				
	DESCRIPTION: 	This do file cleans the data from LISA.

*******************************************************************************/

* Start from a clean session before any data are loaded
clear all
* Prefix of this do-file's output files (used as ${id_code} in the save paths below).
* The globals ${path_LISA}, ${data} and ${data_intermediate} are used but not
* defined in this file, so they must be set before it is run.
global id_code 001_2

********************************************************************************
* A1: LISA data - appending the data
********************************************************************************
/* First step: pre-clean each yearly file, keeping only the variables we are interested in.
Variables that do not exist in a given year are generated as empty placeholders so that
the keep command below does not return an error and every yearly file has the same columns. */
forvalues yr = 1990/2017 {
	display `yr'
	use "${path_LISA}/Lisa_`yr'.dta", clear

	* ---- Placeholders for variables that are not in this year's raw file ----
	* The creation order within each year is deliberately left unchanged so
	* that the column order of the saved yearly files stays the same.
	if `yr' <= 1991 {
		generate ALosDag = .
	}
	if `yr' <= 1993 {
		generate SjukSum_Bdag_Midas = .
	}
	if `yr' <= 2001 {
		generate AstSNI2002 = ""
		generate AstSNI2007 = ""
	}
	if `yr' <= 2000 {
		generate Ssyk4 = ""
	}

	* From 2005 the 18+ count is built as the sum of the two older-child counts
	if `yr' >= 2005 {
		generate Barn18plus = Barn18_19 + Barn20plus
	}

	if `yr' >= 2002 {
		generate AstSNI92 = ""
	}
	if inrange(`yr', 2002, 2006) {
		generate AstSNI2007 = ""
	}
	if `yr' >= 2011 {
		generate AstSNI2002 = ""
	}
	if `yr' <= 2013 {
		generate Ssyk4_2012 = ""
	}
	if `yr' >= 2014 {
		generate Ssyk4 = ""
	}

	* From 2016 these three variables are converted to numeric
	* (presumably stored as strings in the raw files - confirm)
	if `yr' >= 2016 {
		destring LoneInk ForvErs ALosDag, replace
	}

	* Calendar year of the file
	generate Ar = `yr'

	* Keep only the variables used downstream
	keep LopNr_PersonNr LopNr_FamId LopNr_PeOrgNr_LISA Ar Alder SenInvAr Kon ///
		Kommun MedbGrEg FamTypF Sun2000niva AstSNI92 AstSNI2002 AstSNI2007 ///
		ForvErs LoneInk Ssyk4 Ssyk4_2012 SjukSum_Bdag_Midas ALosDag ///
		Barn0_3 Barn4_6 Barn7_10 Barn11_15 Barn16_17 Barn18plus YrkStalln SyssStat*

	destring SyssStat*, replace

	* One row per person and year (surplus rows are dropped without comparing the other columns)
	duplicates drop LopNr_PersonNr Ar, force

	compress
	save "${data}/${id_code}_Lisa_`yr'_tidy.dta", replace
}


* Stack the tidy yearly files into one panel for 1990-2017:
* load 1990 first, then append each later year in turn
use "${data}/${id_code}_Lisa_1990_tidy.dta", clear

forvalues yr = 1991(1)2017 {
	display `yr'
	append using "${data}/${id_code}_Lisa_`yr'_tidy.dta"
}

compress
save "${data}/${id_code}_Lisa_allYears.dta", replace


********************************************************************************
* A2: LISA data - create variables that require the full dataset (not just the unemployed)
********************************************************************************
use  "${data}/${id_code}_Lisa_allYears.dta", clear
compress

* Income of other family members:
* the family-by-year total of ForvErs minus the person's own ForvErs.
* Rows without a family identifier are excluded from the total and left missing;
* otherwise bysort would pool all of them into one artificial "family" per year.
bysort LopNr_FamId Ar: egen FamInc = total(ForvErs) if !missing(LopNr_FamId)
* Note: total() ignores missing ForvErs, but the subtraction below makes FamInc
* missing for a person whose own ForvErs is missing.
replace FamInc = FamInc - ForvErs
label var FamInc "Income of other family members"
drop LopNr_FamId

* Size of employer: number of person records at the firm in a given year
* (left missing for rows without a firm identifier)
bysort LopNr_PeOrgNr_LISA Ar: egen nEmployees = count(LopNr_PersonNr) if LopNr_PeOrgNr_LISA != .

* Change in the firm size, computed on a temporary firm-by-year file

	preserve

	* Reduce to one row per firm and year
	keep LopNr_PeOrgNr_LISA Ar nEmployees
	duplicates drop

	* Declare the firm-year panel so the lag operator can be used, then take
	* the relative change in size between t-1 and t
	xtset LopNr_PeOrgNr_LISA Ar
	generate firmSizeChange = (nEmployees - L.nEmployees) / L.nEmployees
	drop nEmployees

	* Park the firm-year result in a temporary file
	tempfile firmGrowth
	save `firmGrowth'

	restore

	* Attach the firm-level change to every person-year row;
	* firm-years found only in the temporary file are not kept
	merge m:1 LopNr_PeOrgNr_LISA Ar using `firmGrowth', keep(master match) nogenerate

* Layoff risk: share of a firm's employees in year t who are not observed
* at the same firm in year t+1

	preserve

	* Indicator that the individual works for the same company next year:
	* the person's next record (sorted by person and year) must belong to the
	* same person, be exactly one year later, and have the same non-missing firm
	sort LopNr_PersonNr Ar
	gen worksNextYear= ///
	(LopNr_PeOrgNr_LISA[_n]==LopNr_PeOrgNr_LISA[_n+1] & LopNr_PeOrgNr_LISA[_n]!=. ///
	& Ar[_n]+1==Ar[_n+1] ///
	& LopNr_PersonNr[_n]==LopNr_PersonNr[_n+1])

	* Number of employees that continue to work at the company next year
	bys LopNr_PeOrgNr_LISA Ar: egen nWorksNextYear=total(worksNextYear) if LopNr_PeOrgNr_LISA!=.

	* Share of employees that leave the company next year (_N = rows in the firm-year)
	bys LopNr_PeOrgNr_LISA Ar: gen layoffRate=1-(nWorksNextYear/_N) if LopNr_PeOrgNr_LISA!=.

	* In the last year of the panel there is no t+1 record for anyone, so the
	* rate would mechanically equal 1. It is unknown, so set it to missing.
	summarize Ar, meanonly
	replace layoffRate = . if Ar == r(max)

	* Keep one observation per firm and year, and only the merge keys plus the
	* new variable, so no person-level columns are carried into the merge
	keep LopNr_PeOrgNr_LISA Ar layoffRate
	duplicates drop LopNr_PeOrgNr_LISA Ar, force

	* Save as tempfile
	tempfile temp2
	save `temp2'

	restore

	* Merge back to the rest of the data (firm-years only in the tempfile are dropped)
	merge m:1 LopNr_PeOrgNr_LISA Ar using `temp2'
	drop if _merge==2
	drop _merge

********************************************************************************
* A3: LISA data - Keep only individuals who show up in our unemployed sample 
*******************************************************************************
* Build the list of distinct person identifiers found in the unemployment-spell file
preserve

	use "${data}/001_1_UnemploymentSpells.dta", clear
	keep LopNr_PersonNr
	duplicates drop LopNr_PersonNr, force

	tempfile spellPersons
	save `spellPersons'

restore

* Keep only LISA rows whose person identifier appears in that list
merge m:1 LopNr_PersonNr using `spellPersons', keep(match) nogenerate

save "${data_intermediate}/${id_code}_LISA_clean.dta", replace


********************************************************************************
* A4: LISA data - renaming and cleaning variables 
********************************************************************************

use "${data_intermediate}/${id_code}_LISA_clean.dta", clear

******************************** Demographics *********************************

* The SyssStat* variables are used to calculate the unemployment rate
* but are not needed in the model, so they are removed here
drop SyssStat*

* Year and age: English names
rename Ar year
rename Alder age

* Gender: convert to numeric, shift the coding down by one so it becomes 0/1,
* and attach value labels
destring Kon, replace
rename Kon Gender
replace Gender = Gender - 1
label define Gender 0 "Male"
label define Gender 1 "Female", add
label values Gender Gender

* Family type: translate the two-character code FamTypF into the numeric scale 2-12.
* Codes "00" (contradictory info) and "60" (children incorrectly registered),
* like any code not listed below, stay missing.
generate FamType = .
local newCode = 2
foreach oldCode in 11 12 13 21 22 23 31 32 41 42 50 {
	replace FamType = `newCode' if FamTypF == "`oldCode'"
	local ++newCode
}
drop FamTypF

label define FamilyType ///
	2  "Married, no kids at home" ///
	3  "Married, min one kid under 18 at home" ///
	4  "Married, kid above 18 at home" ///
	5  "cohabitant, no kids" ///
	6  "cohabitant, min one kid under 18 at home" ///
	7  "cohabitant, kid above 18 at home" ///
	8  "single father, under 18" ///
	9  "single father, above 18" ///
	10 "single mother, under 18" ///
	11 "single mother, above 18" ///
	12 "single, no dependents"

label values FamType FamilyType


* Civil status: collapse the family types into married / single / cohabiting
tab FamType, missing
generate civilStatus = .
replace civilStatus = 1 if inrange(FamType, 2, 4)
replace civilStatus = 3 if inrange(FamType, 5, 7)
replace civilStatus = 2 if inrange(FamType, 8, 12)
label define CivilStat 1 "Married" 2 "Single" 3 "Cohabiting"
label values civilStatus CivilStat
tab civilStatus, missing
drop FamType

* Number of children
* Both counts are plain sums of the age-bracket counts, so they are missing
* whenever any one of the bracket counts is missing.
gen N_Kids = Barn0_3 + Barn4_6 + Barn7_10 + Barn11_15 + Barn16_17 + Barn18plus
gen N_Kids_U18 = Barn0_3 + Barn4_6 + Barn7_10 + Barn11_15 + Barn16_17
label variable N_Kids_U18 "Number of kids under 18"

* Age of youngest child (this is a grouped variable)
* The value is the lower bound of the age bracket. Brackets are assigned from
* oldest to youngest so that the youngest non-empty bracket overwrites the
* older ones; the order of these replace statements therefore matters.
gen Age_Youngest = .
replace Age_Youngest = 18 if Barn18plus>0
replace Age_Youngest = 16 if Barn16_17>0
replace Age_Youngest = 11 if Barn11_15>0
replace Age_Youngest = 7 if Barn7_10>0
replace Age_Youngest = 4 if Barn4_6>0
replace Age_Youngest = 0 if Barn0_3>0
 
* NOTE(review): persons without kids receive the value 9 here, although the
* original comment said "replace with . if no kids". Without this line they
* would already be missing, so 9 looks like a deliberate "no kids" code -
* confirm which is intended, since 9 lies inside the range of bracket values.
replace Age_Youngest = 9 if N_Kids==0 // sets 9 (not .) when the person has no kids - see note above
drop Barn*

* Education Level
* Taken from the first character of Sun2000niva and converted to numeric
gen EducLevel=substr(Sun2000niva,1,1)
destring EducLevel, replace
drop Sun2000niva
tab EducLevel, missing
* Level 9 is recoded to missing (presumably an "unknown" code - confirm)
replace EducLevel=. if EducLevel==9
tab EducLevel, missing


******************************** Migration *********************************

* Foreign individual: 0 when the citizenship group is 0, 1 for groups 1-11,
* missing otherwise
destring MedbGrEg, replace
tab MedbGrEg, missing // there are 12.5 K observations missing
generate foreign = cond(MedbGrEg == 0, 0, cond(inrange(MedbGrEg, 1, 11), 1, .))
label define foreigner 0 "Swedish citizen" 1 "Foreign citizen"
label values foreign foreigner

* Citizenship group, with value labels
rename MedbGrEg citizenship
label define citizen ///
	0  "Sweden" ///
	1  "Nordic" ///
	2  "EU15" ///
	3  "Europe other" ///
	4  "Africa" ///
	5  "N America" ///
	6  "S America" ///
	7  "Asia" ///
	8  "Oceania" ///
	9  "Soviet U" ///
	11 "Other"
label values citizenship citizen

* Year migrated
destring SenInvAr, replace
rename SenInvAr yearMigrated

* Cohort of migration: single years are kept from 2010 onwards, earlier years
* are grouped by decade, and everything from 1900 up to 1979 is coded 1900
generate migrationCohort = yearMigrated
foreach decade in 2000 1990 1980 {
	replace migrationCohort = `decade' if yearMigrated >= `decade' & yearMigrated < `decade' + 10
}
replace migrationCohort = 1900 if yearMigrated >= 1900 & yearMigrated < 1980

label define migCohort ///
	1900 "Migrated before 1980" ///
	1980 "Migrated in 1980-1990" ///
	1990 "Migrated in 1990-2000" ///
	2000 "Migrated in 2000-2010"
forvalues yr = 2010/2017 {
	label define migCohort `yr' "Migrated in `yr'", add
}
label values migrationCohort migCohort

******************************** Income *********************************

label variable ForvErs "earned income and work related benefits"

* Earned income other than wages = total earned income minus gross cash salary
generate OtherInc = ForvErs - LoneInk
label variable OtherInc "earned income excl. gross cash salary"

rename LoneInk WageInc
label variable WageInc "gross cash salary"

drop ForvErs

* CPI adjusted income: bring in the yearly CPI (years present only in the
* inflation file are not kept)
merge m:1 year using "${data}/Inflation.dta", keep(master match) nogenerate

* Replace each nominal income variable by its CPI-deflated counterpart
foreach inc of varlist WageInc OtherInc FamInc {
	generate `inc'_adj = `inc'/CPI*100
	drop `inc'
}

label variable FamInc_adj "income of other family members (2006 SEK)"
label variable WageInc_adj "gross cash salary (2006 SEK)"
label variable OtherInc_adj "earned income excl. gross cash salary (2006 SEK)"
drop CPI

******************************** Employment *********************************

* Firm identifier
rename LopNr_PeOrgNr_LISA firmLISA
codebook firmLISA // firm is missing for 19 443 443 observations (21 % observations)

* Employment status: convert to numeric and collapse codes 1-5 into a single
* category, leaving a 0/1 variable
destring YrkStalln, replace
rename YrkStalln emplStatu
replace emplStatu = 1 if inrange(emplStatu, 1, 5)
label define empl 0 "No taxable income" 1 "Any taxable income"
label values emplStatu empl

* Days in unemployment and receiving benefits
rename ALosDag DaysUnemp
label variable DaysUnemp "days spent in unempl in a year"
rename SjukSum_Bdag_Midas DaysOnDI
label variable DaysOnDI "days in sickness etc. benefits in a year"

******************************** Industry *********************************

* Harmonise industry codes
* Each of the three SNI classifications is converted to numeric and mapped
* through its own crosswalk file; codes found only in a crosswalk are dropped.
destring AstSNI92 AstSNI2002 AstSNI2007, replace

rename AstSNI92 SNI92_old
merge m:1 SNI92_old using "${data}/006_SNI92_new.dta"
drop if _merge==2
drop _merge
drop SNI92_old

rename AstSNI2002 SNI2002_old
merge m:1 SNI2002_old using "${data}/006_SNI2002_new.dta"
drop if _merge==2
* SNI2002 codes without a crosswalk entry are kept unchanged
replace SNI2002_new = SNI2002_old if SNI2002_new==.
drop _merge
drop SNI2002_old

rename AstSNI2007 SNI2007_old
merge m:1 SNI2007_old using "${data}/006_SNI2007_new.dta"
drop if _merge==2
drop _merge
drop SNI2007_old

* Single harmonised code: SNI92 first, then SNI2002, then SNI2007
gen Industry_5digit = SNI92_new
replace Industry_5digit = SNI2002_new if Industry_5digit==.
replace Industry_5digit = SNI2007_new if Industry_5digit==.

* Create the 3-digit industry code
* The code is numeric, so leading zeros are not stored. Taking the first three
* characters of its string form would turn e.g. 01110 (stored as 1110) into
* 111 instead of 011 and merge it with the 111xx group. Dropping the last two
* digits arithmetically gives the correct 3-digit group.
* NOTE(review): assumes the harmonised codes are 5-digit SNI codes - confirm
* against the crosswalk files.
gen Industry_3digit = floor(Industry_5digit/100)

drop SNI* Industry_5digit

******************************** Occupation *********************************

* Occupation code: blank out the "****" placeholder, then keep the first
* three characters of the 4-digit code as a numeric 3-digit code
replace Ssyk4 = "" if Ssyk4 == "****"
generate Occupation_3D = substr(Ssyk4, 1, 3) // available for 2002-2013
destring Occupation_3D, replace
drop Ssyk4

/* For now we only need the occupation variable in 2006 
so we drop the redundant variables */
drop Ssyk4_2012*


******************************** Municipality *********************************

* Municipality
* Converted to numeric, so the leading zeros written in the codes below have
* no effect on the comparisons or assignments.
rename Kommun Municipality 
destring Municipality, replace

* Transform municipality codes
* The replacements run one after another, so a code written by an earlier line
* can be rewritten by a later one. For example, 1535 and 1443 are first set to
* 1583 (1994 rule) and 1583 is then set to 1490 in the 1997 block. Do not
* reorder these lines without checking such chains.
	* In 1991, parts of 0480 became new municipalities 0461 and 0488.
	replace Municipality = 0480 if Municipality==0461 | Municipality==0488
	* In 1994, a part of 1583 became a new municipality 1535, which was
	* changed to 1443 in 1997.
	replace Municipality = 1583 if Municipality==1535 | Municipality==1443
	* In 1994, a part of 1880 became a new municipality 1814.
	replace Municipality = 1880 if Municipality==1814
	* In 1998, a part of 0181 became a new municipality 0140.
	replace Municipality = 0181 if Municipality==0140
	* In 2002, a part of 0380 became a new municipality 0330.
	replace Municipality = 0380 if Municipality==0330
	
	* 1996 changes: old codes 11xx are mapped to their 12xx replacements
	replace Municipality = 1256 if Municipality==1121
	replace Municipality = 1257 if Municipality==1137
	replace Municipality = 1270 if Municipality==1160
	replace Municipality = 1272 if Municipality==1162
	replace Municipality = 1273 if Municipality==1163
	replace Municipality = 1275 if Municipality==1165
	replace Municipality = 1276 if Municipality==1166
	replace Municipality = 1277 if Municipality==1167
	replace Municipality = 1278 if Municipality==1168
	replace Municipality = 1290 if Municipality==1180
	replace Municipality = 1291 if Municipality==1181
	replace Municipality = 1292 if Municipality==1182
	replace Municipality = 1293 if Municipality==1183
	
	* 1997 changes: old codes 15xx/16xx are mapped to their 06xx/14xx replacements
	replace Municipality = 0642 if Municipality==1622
	replace Municipality = 0643 if Municipality==1623
		
	replace Municipality = 1438 if Municipality==1504
	replace Municipality = 1439 if Municipality==1507
	replace Municipality = 1440 if Municipality==1521
	replace Municipality = 1441 if Municipality==1524
	replace Municipality = 1442 if Municipality==1527
	replace Municipality = 1443 if Municipality==1535 // No effect: 1535 was already recoded to 1583 by the 1994 rule above
	replace Municipality = 1444 if Municipality==1602
	replace Municipality = 1445 if Municipality==1603
	replace Municipality = 1446 if Municipality==1637
	replace Municipality = 1447 if Municipality==1643
	replace Municipality = 1452 if Municipality==1552
	replace Municipality = 1460 if Municipality==1560
	replace Municipality = 1461 if Municipality==1561
	replace Municipality = 1462 if Municipality==1562
	replace Municipality = 1463 if Municipality==1563
	replace Municipality = 1465 if Municipality==1565
	replace Municipality = 1466 if Municipality==1566
	replace Municipality = 1470 if Municipality==1660
	replace Municipality = 1471 if Municipality==1661
	replace Municipality = 1472 if Municipality==1662
	replace Municipality = 1473 if Municipality==1663
	replace Municipality = 1487 if Municipality==1580
	replace Municipality = 1488 if Municipality==1581
	replace Municipality = 1489 if Municipality==1582
	* 1583 here also includes the rows set to 1583 by the 1994 rule above
	replace Municipality = 1490 if Municipality==1583
	replace Municipality = 1491 if Municipality==1584
	replace Municipality = 1492 if Municipality==1585
	replace Municipality = 1493 if Municipality==1680
	replace Municipality = 1494 if Municipality==1681
	replace Municipality = 1495 if Municipality==1682
	replace Municipality = 1496 if Municipality==1683
	replace Municipality = 1497 if Municipality==1684
	replace Municipality = 1498 if Municipality==1685
	replace Municipality = 1499 if Municipality==1686
	
	* 2006 change: code 1917 is mapped to 0331
	replace Municipality = 0331 if Municipality==1917
	
* Overwrite the intermediate file with the cleaned variables
save "${data_intermediate}/${id_code}_LISA_clean.dta", replace

********************************************************************************
* A5: LISA data - Creating variables that require lags
********************************************************************************

use "${data_intermediate}/${id_code}_LISA_clean.dta", clear

* Declare the person-year panel so that the lag operators (L1., L2., ...) work
order Lop* year, first
xtset LopNr_PersonNr year 

******************************** Days in unemployment and DI ********************

* Days unemployed and in receipt of sickness benefit, occupational injury benefit
* or rehabilitation in the previous 1 to 5 years

* Store lags 1-5 of each variable as ordinary columns. For each variable they
* are created consecutively, which the variable range in rowtotal() below relies on.
foreach var in DaysUnemp DaysOnDI {
	forvalues i = 1/5 {
		gen L`i'_`var' = L`i'.`var'
	}
}

* Sum the days in unemployment and days in DI
* (debugging breakpoints - "pause on" and "pause" - were removed here so the
* do-file runs through without waiting for keyboard input)
foreach var in DaysUnemp DaysOnDI {
	
	* Total over the last i years; rowtotal() treats missing lags as zero
	foreach i in 1 2 3 4 5 {
	egen `var'_`i'Years = rowtotal(L1_`var' - L`i'_`var')
	}

	* replace the variable as missing if all the relevant observations are missing
	replace `var'_1Years=. if (L1_`var'==.)	
	replace `var'_2Years=. if (L1_`var'==. & L2_`var'==.)
	replace `var'_3Years=. if (L1_`var'==. & L2_`var'==. & L3_`var'==.)
	replace `var'_4Years=. if (L1_`var'==. & L2_`var'==. & L3_`var'==. & L4_`var'==.)
	replace `var'_5Years=. if (L1_`var'==. & L2_`var'==. & L3_`var'==. & L4_`var'==. & L5_`var'==.)

	* generate a variable indicating that the variable was created based on some missing observations
	* i.e., at least one observation used to create the variable was missing
	* there is a different variable created in 001_09 do file that indicates that all observations were missing
	gen `var'_1Years_Miss=(L1_`var'==.)
	gen `var'_2Years_Miss=(L1_`var'==. | L2_`var'==.)
	gen `var'_3Years_Miss=(L1_`var'==. | L2_`var'==. | L3_`var'==.)
	gen `var'_4Years_Miss=(L1_`var'==. | L2_`var'==. | L3_`var'==. | L4_`var'==.)
	gen `var'_5Years_Miss=(L1_`var'==. | L2_`var'==. | L3_`var'==. | L4_`var'==. | L5_`var'==.)

	* The helper lags and the current-year value are no longer needed
	drop L*_`var'
	drop `var'
}

foreach i in 1 2 3 4 5 {
	label var DaysUnemp_`i'Years "days in unempl in last `i'Y"
	label var DaysOnDI_`i'Years "days in DI in last `i'Y"
}

save "${data_intermediate}/${id_code}_LISA_clean2.dta", replace

****************************** Income history ******************************

use "${data_intermediate}/${id_code}_LISA_clean2.dta", clear

* Income history: for each income measure, the total over lags 2 to 5
foreach inc in OtherInc_adj WageInc_adj FamInc_adj {

	* Store lags 2-5 as ordinary columns and remember their names
	local lagVars
	forvalues k = 2/5 {
		generate L`k'_`inc' = L`k'.`inc'
		local lagVars `lagVars' L`k'_`inc'
	}

	* Row total that skips missing lags; with the missing option the total is
	* itself missing when all four lags are missing
	egen `inc'_L2_L5 = rowtotal(`lagVars'), missing

	* Flag: at least one of the four lags was missing
	* (another variable generated in 001_9 indicates that all of them were missing)
	tempvar nLagsMissing
	egen `nLagsMissing' = rowmiss(`lagVars')
	generate `inc'_L2_L5_Miss = (`nLagsMissing' > 0)

	drop `lagVars' `nLagsMissing'
}

*************************** Employment history ******************************

* Calculate the number of years working for the same employer in the last 5 years

* Store lags 1-5 of the firm identifier as ordinary columns
forvalues i=1/5 {
 gen L`i'_firmLISA = L`i'.firmLISA
}

* Can't do L_emplStatu for more years because only available from 1990
gen L_emplStatu = L1.emplStatu

* Calculate number of years individual was working for the employer for whom he/she was working in the year before becoming unemployed
* Each step requires the firm of year t-1 to match the firm of every earlier
* year up to that lag, so tenure counts consecutive years at the t-1 employer.
gen tenure =.
replace tenure=0 if L_emplStatu==0
replace tenure=1 if L1_firmLISA!=.
replace tenure = 2 if L1_firmLISA==L2_firmLISA & L1_firmLISA!=.
replace tenure = 3 if L1_firmLISA==L2_firmLISA & L1_firmLISA==L3_firmLISA & L1_firmLISA!=.
replace tenure = 4 if L1_firmLISA==L2_firmLISA & L1_firmLISA==L3_firmLISA & L1_firmLISA==L4_firmLISA & L1_firmLISA!=.
* The fifth comparison must also use the t-1 firm (L1_firmLISA), like the steps
* above; comparing the current-year firm with L5 would test a different employer.
replace tenure = 5 if L1_firmLISA==L2_firmLISA & L1_firmLISA==L3_firmLISA & L1_firmLISA==L4_firmLISA & L1_firmLISA==L5_firmLISA & L1_firmLISA!=.


* Flag: the firm identifier is missing in at least one of the five lags
gen tenureMissing=(L1_firmLISA==. | L2_firmLISA==. | L3_firmLISA==. | L4_firmLISA==. | L5_firmLISA==.)

* Calculate the number of employers in the last 5 years
* Start from 0 (no taxable income in t-1) or 1 (a firm is observed in t-1) ...
forval i = 1/5 {
	gen nEmployers`i'Y =.
	replace nEmployers`i'Y=0 if L_emplStatu==0
	replace nEmployers`i'Y=1 if L1_firmLISA!=.
}

* ... then add one for every change of firm between consecutive lags that falls
* inside the i-year window (the earlier of the two firms must be non-missing)
forval i = 2/5 {
	replace nEmployers`i'Y = nEmployers`i'Y +1 if L1_firmLISA!=L2_firmLISA & L2_firmLISA!=.
}

forval i = 3/5 {
	replace nEmployers`i'Y = nEmployers`i'Y +1 if L2_firmLISA!=L3_firmLISA & L3_firmLISA!=.
}

forval i = 4/5 {
	replace nEmployers`i'Y = nEmployers`i'Y +1 if L3_firmLISA!=L4_firmLISA & L4_firmLISA!=.
}

replace nEmployers5Y = nEmployers5Y +1 if L4_firmLISA!=L5_firmLISA & L5_firmLISA!=.

drop L*_firmLISA emplStatu


****************************** Lags of selected variables **********************

* Replace each of these variables by its one-year lag (the current-year value is dropped)
foreach v of varlist Municipality civilStatus N_Kids N_Kids_U18 Age_Youngest OtherInc_adj WageInc_adj FamInc_adj Industry_3digit {
	generate L_`v' = L.`v'
	drop `v'
}

********************************************************************************
* A6: LISA data - Replacing missing variables with lags
********************************************************************************

* Occupation, firm size, firm size change and layoff rate: use the first lag,
* falling back on the second lag when the first lag is missing
foreach v in nEmployees firmSizeChange layoffRate Occupation_3D {
	generate L_`v'_L1L2 = cond(L1.`v' == ., L2.`v', L1.`v')
	drop `v'
}

********************************************************************************
* A7: Saving the dataset
********************************************************************************

compress
save "${data}/${id_code}_LISA_clean.dta", replace
